# install.packages('Seurat')
# remotes::install_github('chris-mcginnis-ucsf/DoubletFinder')
# install.packages("devtools")
# BiocManager::install("SingleCellExperiment")
# devtools::install_github("immunogenomics/harmony")
library(Seurat)
## Registered S3 method overwritten by 'spatstat.core':
## method from
## formula.glmmPQL MASS
## Attaching SeuratObject
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DoubletFinder)
library(cowplot)
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
##
## Assays
## The following object is masked from 'package:Seurat':
##
## Assays
library(harmony)
## Loading required package: Rcpp
I have a single cell gene expression data for two samples: skin
cancer Melanoma cell lines A375 and A375_treated. The pre-processing of
the data is described in my earlier tutorial using 10X’s cellranger pipe
line at: https://github.com/githubrudramani/Pipelines/blob/main/sc-RNAseq/sc-RNAseq%20cellrnager.pdf
Lets load the data from directory
setwd("/Users/rudramanipokhrel/work/Pipelines/sc_Rnaseq/Melanoma")
# Clear memory
rm(list = ls())
# Read 10X data and create Seurat object
# load A375 cell line data
data1 <- Read10X(data.dir = "A375_count/outs/filtered_feature_bc_matrix/")
A375 <- CreateSeuratObject(counts = data1, project = "A375", min.cells = 3, min.features = 200)
data2 <- Read10X(data.dir = "A375_treated_count/outs/filtered_feature_bc_matrix/")
A375_treated <- CreateSeuratObject(counts = data2, project = "A375_treated", min.cells = 3, min.features = 200)
dim(A375)
## [1] 21542 10870
dim(A375_treated)
## [1] 21801 9256
# Note min.cells is recommned 0.1% of total cells in the data. In this case the data has ~10000 cells.
# For now this parameters works. We will filter them further in upcoming steps.
Tutorial link:
http://barc.wi.mit.edu/education/hot_topics/scRNAseq_2020/SingleCell_Seurat_2020.html
First look at A375 sample
count1 <- A375@assays$RNA@counts
counts_per_cell1 <- Matrix::colSums(count1)
counts_per_gene1 <- Matrix::rowSums(count1)
genes_per_cell1 <- Matrix::colSums(count1>0) # count a gene only if it has non-zero reads mapped.
cells_per_gene1 <- Matrix::rowSums(count1>0) # only count cells where the gene is expressed
#Plot cells ranked by their number of detected genes.
plot(sort(genes_per_cell1), xlab='cell', log='y', main='genes per cell (ordered)')
# plot minimum and maximum cut-off
abline(h=350, col='blue') # lower threshold
abline(h=5000, col='red') # upper threshold
we can remove the cell having less than 350 genes and more than 5000
genes Extremely high number of detected genes could indicate doublets.
However, depending on the cell type composition in your sample, you may
have cells with higher number of genes (and also higher counts) from one
cell type. In this case, we will run doublet prediction further down
Examine percentag of mitochondrial read
content
In a cell, the % of read from mitochondrial genome is indicative of
quality of cells. High fraction of mitochondrial content might be due to
damaged cells during tissue isolation, because if cytoplasmic mRNA has
leaked out through a broken membrane, only mRNA located in the
mitochondria is still conserved. However specific biology of interest
may be attributed to the high mitochondrial counts. In general it is
wise to remove cells having high mitochondrial contents. get
mitocondrial genes in count matrix
mito_genes1 <- grep("^mt-", rownames(count1) , ignore.case=T, value=T)
# compute mitochondrial percentages
mito_gene_counts1 = Matrix::colSums(count1[mito_genes1,])
pct_mito1 = mito_gene_counts1 / counts_per_cell1 * 100
plot(sort(pct_mito1), xlab = "cells sorted by percentage mitochondrial counts", ylab = "percentage mitochondrial counts")
cutoff <- median(pct_mito1) + 2 *sd(pct_mito1)
cutoff
## [1] 34.04232
abline(h=cutoff, col='orange')
# we can remove the cell mt content more than cutoff
A375[["percent.mt"]] <- PercentageFeatureSet(object = A375, pattern = "^MT-")
# Make violin plot
VlnPlot(A375, c("nCount_RNA", "nFeature_RNA", "percent.mt"), pt.size = 0.1)
# Apply the filter:
A375_filt <- subset(A375, subset =
nCount_RNA < 20000 &
nFeature_RNA > 350 &
percent.mt < cutoff)
VlnPlot(A375_filt, c("nCount_RNA", "nFeature_RNA", "percent.mt"), pt.size = 0.1)
### Filtering the dublets Doublets/Mulitples of cells in the same
well/droplet is a common issue in scRNAseq protocols. Especially in
droplet-based methods whith overloading of cells. In a typical 10x
experiment the proportion of doublets is linearly dependent on the
amount of loaded cells. As indicated from the Chromium user guide: https://assets.ctfassets.net/an68im79xiti/1eX2FPdpeCgnCJtw4fj9Hx/7cb84edaa9eca04b607f9193162994de/CG000204_ChromiumNextGEMSingleCell3_v3.1_Rev_D.pdf
For my data I am using doublet rates for ~10000 recovered cell as 7.6%.
Follow this tutorial for more detail:
https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/seurat/seurat_01_qc.html#Predict_doublets
A375_filt = NormalizeData(A375_filt)
A375_filt = FindVariableFeatures(A375_filt,selection.method = "vst", nfeatures = 2500, verbose = F)
all.genes <- rownames(A375_filt)
A375_filt = ScaleData(A375_filt,features = all.genes, verbose = F)
A375_filt = RunPCA(A375_filt, verbose = F, npcs = 20)
A375_filt = RunUMAP(A375_filt, dims = 1:10, verbose = F)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
nExp <- round(ncol(A375_filt) * 0.076) # expect 7.6% doublets
A375_filt <- doubletFinder_v3(A375_filt, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
## Loading required package: fields
## Loading required package: spam
## Spam version 2.8-0 (2022-01-05) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following object is masked from 'package:stats4':
##
## mle
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: viridis
## Loading required package: viridisLite
##
## Try help(fields) to get started.
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## [1] "Creating 3366 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
# Name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(A375_filt@meta.data)[grepl("DF.classification", colnames(A375_filt@meta.data))]
DimPlot(A375_filt) + NoAxes()
cowplot::plot_grid(ncol = 2, DimPlot(A375_filt, group.by = "orig.ident") + NoAxes(),
DimPlot(A375_filt, group.by = DF.name) + NoAxes())
VlnPlot(A375_filt, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)
dim(A375_filt)
## [1] 21542 10099
A375_filt = A375_filt[, A375_filt@meta.data[, DF.name] == "Singlet"]
dim(A375_filt)
## [1] 21542 9331
VlnPlot(A375_filt, features = "nFeature_RNA", pt.size = 0.1)
count2 <- A375_treated@assays$RNA@counts
counts_per_cell2 <- Matrix::colSums(count2)
counts_per_gene2 <- Matrix::rowSums(count2)
genes_per_cell2 <- Matrix::colSums(count2>0) # count a gene only if it has non-zero reads mapped.
cells_per_gene2 <- Matrix::rowSums(count2>0) # only count cells where the gene is expressed
#Plot cells ranked by their number of detected genes.
plot(sort(genes_per_cell2), xlab='cell', log='y', main='genes per cell (ordered)')
# plot minimum and maximum cut-off
abline(h=350, col='blue') # lower threshold
abline(h=5000, col='red') # upper threshold
#Examine percent mitochondrial read content.
mito_genes2 <- grep("^mt-", rownames(count2) , ignore.case=T, value=T)
mito_genes2
## [1] "MT-ND1" "MT-ND2" "MT-CO1" "MT-CO2" "MT-ATP8" "MT-ATP6" "MT-CO3"
## [8] "MT-ND3" "MT-ND4L" "MT-ND4" "MT-ND5" "MT-ND6" "MT-CYB"
# compute mitochondrial percentages
mito_gene_counts2 = Matrix::colSums(count2[mito_genes2,])
pct_mito2 = mito_gene_counts2 / counts_per_cell2 * 100
plot(sort(pct_mito2), xlab = "cells sorted by percentage mitochondrial counts", ylab = "percentage mitochondrial counts")
cutoff2 <- median(pct_mito2) + 2 *sd(pct_mito2)
abline(h=cutoff2, col='orange')
# Implementing inbuilt seurat packages to to qc and filtering:
A375_treated[["percent.mt"]] <- PercentageFeatureSet(object = A375_treated, pattern = "^MT-")
# Make violin plot
VlnPlot(A375_treated, c("nCount_RNA", "nFeature_RNA", "percent.mt"), pt.size = 0.1)
# Apply the filter:
A375_treated_filt <- subset(A375_treated, subset =
nCount_RNA < 20000 &
nFeature_RNA > 350 &
percent.mt < cutoff)
VlnPlot(A375_treated_filt, c("nCount_RNA", "nFeature_RNA", "percent.mt"), pt.size = 0.1)
## ----------------------------------------------------------------------------------------
# Filtering the dublets
## ----------------------------------------------------------------------------------------
A375_treated_filt = NormalizeData(A375_treated_filt)
A375_treated_filt = FindVariableFeatures(A375_treated_filt, selection.method = "vst", nfeatures = 2500, verbose = F)
all.genes = rownames(A375_treated_filt)
A375_treated_filt = ScaleData(A375_treated_filt, features = all.genes, verbose = F)
A375_treated_filt = RunPCA(A375_treated_filt, verbose = F, npcs = 20)
A375_treated_filt = RunUMAP(A375_treated_filt, dims = 1:10, verbose = F)
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
nExp <- round(ncol(A375_treated_filt) * 0.076) # expect 7.6% doublets
A375_treated_filt <- doubletFinder_v3(A375_treated_filt, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
## [1] "Creating 2924 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
DF.name = colnames(A375_treated_filt@meta.data)[grepl("DF.classification", colnames(A375_treated_filt@meta.data))]
DimPlot(A375_treated_filt) + NoAxes()
cowplot::plot_grid(ncol = 2, DimPlot(A375_treated_filt, group.by = "orig.ident") + NoAxes(),
DimPlot(A375_treated_filt, group.by = DF.name) + NoAxes())
VlnPlot(A375_treated_filt, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)
A375_treated_filt = A375_treated_filt[,A375_treated_filt@meta.data[, DF.name] == "Singlet"]
VlnPlot(A375_treated_filt, features = "nFeature_RNA", pt.size = 0.1)
Lets see how many cells we filtered
dim(A375)
## [1] 21542 10870
dim(A375_treated)
## [1] 21801 9256
dim(A375_filt)
## [1] 21542 9331
dim(A375_treated_filt)
## [1] 21801 8106
Save the objects in so that we don’t need to run the process again.
saveRDS(A375_filt, file = "A375_filt.rds")
saveRDS(A375_treated_filt, file = "A375_treated_filt.rds")
For samples prepared at different time frames at different kits, we can integrate using various packages to remove the batch or biases. First, I am going to use seurat integrate package which uses Canonical correlation analysis (CCA). The method selects features that are repeatedly variable across datasets for integration. Remember to normalize and identify variable features for each dataset independently.
obj.list <- c(A375_filt, A375_treated_filt)
features <- SelectIntegrationFeatures(object.list = obj.list)
anchors <- FindIntegrationAnchors(object.list = obj.list, anchor.features = features)
## Warning in CheckDuplicateCellNames(object.list = object.list): Some cell names
## are duplicated across objects provided. Renaming to enforce unique cell names.
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 26501 anchors
## Filtering anchors
## Retained 7410 anchors
seurat_integrated <- IntegrateData(anchorset = anchors)
## Merging dataset 2 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
DefaultAssay(seurat_integrated)
## [1] "integrated"
NOTE: if you look at your seurat object, the default assay as changed
from ‘RNA’ to ‘integrated’ this can be change anytime using the line
below this would be the same way you would change between scRNA-seq and
scATAC-seq
DefaultAssay(seurat_integrated) <- “RNA”
Now run the standard workflow for visualization and clustering
#seurat_integrated <- ScaleData(seurat_integrated, vars.to.regress = c("nCount_RNA"), verbose = FALSE)
seurat_integrated <- ScaleData(seurat_integrated, verbose = FALSE)
seurat_integrated <- RunPCA(seurat_integrated, npcs = 30, verbose = FALSE)
seurat_integrated <- RunUMAP(seurat_integrated, reduction = "pca", dims = 1:30)
## 10:55:22 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 10:55:22 Read 17437 rows and found 30 numeric columns
## 10:55:22 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 10:55:22 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:55:23 Writing NN index file to temp file /var/folders/ss/g2916l0d47qf7gp8xrgslkmw0000gr/T//RtmpF2DBAC/filebbbf560b86c2
## 10:55:23 Searching Annoy index using 1 thread, search_k = 3000
## 10:55:27 Annoy recall = 100%
## 10:55:27 Commencing smooth kNN distance calibration using 1 thread
## 10:55:29 Initializing from normalized Laplacian + noise
## 10:55:29 Commencing optimization for 200 epochs, with 805966 positive edges
## 10:55:40 Optimization finished
seurat_integrated <- FindNeighbors(seurat_integrated, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
seurat_integrated <- FindClusters(seurat_integrated, resolution = 0.5,
algorithm = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 17437
## Number of edges: 729156
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8826
## Number of communities: 9
## Elapsed time: 2 seconds
# I prefer using Leiden algorithm (4), the default is Louvain (1). You need to install it using
#pip install leidenalg
Let’s see what proportion of our total cells reside in each cluster
prop.table(table(Idents(seurat_integrated)))
##
## 0 1 2 3 4 5 6
## 0.23507484 0.18655732 0.14044847 0.13224752 0.11171646 0.06623846 0.05499799
## 7 8
## 0.04318403 0.02953490
## --------------------------------------------------------------------------------------------------
# 1. sample specific
## --------------------------------------------------------------------------------------------------
pt <- table(Idents(seurat_integrated), seurat_integrated$orig.ident)
pt <- as.data.frame(pt)
pt$Var1 <- as.character(pt$Var1)
pt$Var1 <- factor(pt$Var1, level =c("0", "1", "2" ,"3" , "4" ,"5" , "6" , "7"))
ggplot(pt, aes(x = Var1, y = Freq, fill = Var2)) +
theme_bw(base_size = 15) +
geom_col(position = "fill", width = 0.5) +
xlab("Sample") +
ylab("Proportion") +
#scale_fill_manual(values = brewer.pal(12, "Paired")) +
theme(legend.title = element_blank())
## --------------------------------------------------------------------------------------------------
# 2. cell cycle specific
## --------------------------------------------------------------------------------------------------
# I have created the list of human cell cycle genes from this source:
# "https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Homo_sapiens.csv>"
cc_genes <- read.csv("https://raw.githubusercontent.com/githubrudramani/Bioinformatics/master/cellcyclegenes.csv")
s_genes <- cc_genes[ cc_genes$phase=="S",]$ext_gene
g2m_genes <- cc_genes[ cc_genes$phase=="G2/M",]$ext_gene
seurat_integrated <- CellCycleScoring(object = seurat_integrated, g2m.features = g2m_genes,
s.features = s_genes)
## Warning: The following features are not present in the object: UBR7, RFC2,
## RAD51, MCM2, TIPIN, MCM6, UNG, POLD3, CDC45, MSH2, MCM5, RAD51AP1, GMNN, RPA2,
## CASP8AP2, NASP, DSCC1, CDCA7, CHAF1B, RRM1, FEN1, PRIM1, UHRF1, not searching
## for symbol synonyms
## Warning: The following features are not present in the object: CBX5, RANGAP1,
## CTCF, PIMREG, ANP32E, LBR, CKS1B, JPT1, not searching for symbol synonyms
table(seurat_integrated$seurat_clusters)
##
## 0 1 2 3 4 5 6 7 8
## 4099 3253 2449 2306 1948 1155 959 753 515
pt <- table(Idents(seurat_integrated), seurat_integrated$Phase)
pt <- as.data.frame(pt)
pt$Var1 <- as.character(pt$Var1)
pt$Var1 <- factor(pt$Var1, level =c("0","1", "2" ,"3" , "4" ,"5" , "6" , "7"))
ggplot(pt, aes(x = Var1, y = Freq, fill = Var2)) +
theme_bw(base_size = 15) +
geom_col(position = "fill", width = 0.5) +
xlab("Cluster") +
ylab("Proportion") +
#scale_fill_manual(values = brewer.pal(3, "YlOrBr")) +
theme(legend.title = element_blank())
Let’s see the UMAP and clusters:
DimPlot(seurat_integrated, reduction = "umap", label = TRUE)
DimPlot(seurat_integrated, reduction = "umap",
split.by = "orig.ident", # this facets the plot
group.by = "Phase", # labels the cells with values from your group.by variable
label = FALSE)
DimPlot(seurat_integrated, reduction = "umap",
split.by = "orig.ident", # this facets the plot
group.by = "seurat_clusters", # labels the cells with values from your group.by variable
label = TRUE)
Seurat has differenct functions like FindMarkers to examine statistical analysis between two clusters, FindConservedMarkers to find conserved genes across samples for a given cluster, and FindAllMarkers iteratevely uses FindMarkers to find markers genes for each cluster vs rest.
cluster.markers <- FindAllMarkers(seurat_integrated, only.pos = TRUE,
min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
# let's take the top 5 marker genes for each cluster and plot as a heatmap
top5 <- cluster.markers %>%
group_by(cluster) %>%
top_n(n = 5, wt = avg_log2FC)
DoHeatmap(seurat_integrated, features = top5$gene)
# You can extract markers for particular cluster
cluster0 <- cluster.markers %>% filter(cluster == "0")
cluster0$pct.diff <- cluster0$pct.1 - cluster0$pct.2
cluster0 <- cluster0[order(-cluster0$pct.diff),]
head(cluster0)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene pct.diff
## MTDH 0.000000e+00 0.5856496 0.926 0.551 0.000000e+00 0 MTDH 0.375
## SRM 5.931359e-282 0.3237898 0.811 0.443 1.186272e-278 0 SRM 0.368
## TPI1 1.395593e-270 0.3930449 0.923 0.572 2.791186e-267 0 TPI1 0.351
## GUK1 1.509491e-306 0.2657783 0.801 0.451 3.018983e-303 0 GUK1 0.350
## AP2M1 2.439654e-270 0.3274912 0.763 0.413 4.879308e-267 0 AP2M1 0.350
## PFN1 0.000000e+00 0.4384106 0.941 0.597 0.000000e+00 0 PFN1 0.344
# we can plot gene of interest
genes <- c("TPI1", "MTDH")
FeaturePlot(seurat_integrated,
reduction = "umap",
features = genes,
pt.size = 0.4,
order = TRUE,
split.by = "orig.ident",
min.cutoff = 'q10',
label = FALSE)
VlnPlot(seurat_integrated, features = genes)
# Dot plot of top 2 genes in a cluster
top2 <- cluster.markers %>%
group_by(cluster) %>%
top_n(n = 2, wt = avg_log2FC)
DotPlot(seurat_integrated, features=unique(top2$gene), dot.scale = 6) +
RotatedAxis() +theme(axis.text.x = element_text(color = "black", size = 10))
I am going to find markers genes across samples, as we do in bulk RNAseq
analysis.
seuratDE <- seurat_integrated
Idents(seuratDE) <- "orig.ident"
markersDE <- FindMarkers(object = seuratDE,ident.1 = "A375_treated",
ident.2 = "A375",
min.pct = 0.25)
topDE <- markersDE[(markersDE$avg_log2FC >0.25) | (markersDE$avg_log2FC < -0.25),]
topDE <- topDE[order(-(topDE$avg_log2FC)),]
head(topDE)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## FXYD3 4.332582e-240 0.3199409 0.380 0.056 8.665164e-237
## RAD51B 5.833668e-81 0.2752425 0.789 0.386 1.166734e-77
## NAV2 1.444413e-51 0.2552213 0.872 0.607 2.888825e-48
## MRPL12 2.290478e-75 -0.2548408 0.584 0.392 4.580957e-72
## ANXA1 2.385762e-08 -0.2559625 0.728 0.507 4.771525e-05
## CARHSP1 2.601973e-52 -0.2567430 0.626 0.464 5.203945e-49
tail(topDE)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## RPS20 3.168520e-135 -0.4553616 0.937 0.879 6.337039e-132
## SH3BGRL3 1.323686e-59 -0.4646193 0.679 0.512 2.647372e-56
## TMSB10 5.504026e-101 -0.4682699 0.908 0.827 1.100805e-97
## UBE2S 5.800565e-52 -0.4703558 0.828 0.682 1.160113e-48
## FTL 7.563389e-137 -0.5330453 0.889 0.808 1.512678e-133
## MT2A 6.952119e-68 -0.6062146 0.876 0.783 1.390424e-64
# Upregulated in treated samples
genes <- c("RAD51B", "FXYD3")
FeaturePlot(seuratDE,
reduction = "umap",
features = genes,
pt.size = 0.4,
order = TRUE,
split.by = "orig.ident",
min.cutoff = 'q10',
label = FALSE)
# Downregulated in treated samples
genes <- c("RPS20", "TMSB10")
FeaturePlot(seuratDE,
reduction = "umap",
features = genes,
pt.size = 0.4,
order = TRUE,
split.by = "orig.ident",
min.cutoff = 'q10',
label = FALSE)
genes <- c("RAD51B", "FXYD3", "RPS20", "TMSB10")
DotPlot(seuratDE, features=genes, dot.scale = 6) +
RotatedAxis() +theme(axis.text.x = element_text(color = "black", size = 10))
saveRDS(seurat_integrated, file = "seurat_integrated_CCA.rds")
I always prefer Harmony for seurat objects integration, first it is faster than CCA and studies recommend Harmony integration in comparasion to other methods. You an follow this paper for comparative analysis: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1850-9
# First merged the dublet filtered object
merged_filt <- merge(A375_filt, A375_treated_filt)
## Warning in CheckDuplicateCellNames(object.list = objects): Some cell names are
## duplicated across objects provided. Renaming to enforce unique cell names.
table(merged_filt$orig.ident)
##
## A375 A375_treated
## 9331 8106
# Dimension Reduction
merged_filt <- merged_filt %>%
Seurat::NormalizeData(verbose = FALSE) %>%
FindVariableFeatures(selection.method = "vst", nfeatures = 2500) %>%
ScaleData(verbose = FALSE, features = rownames(merged_filt) ) %>%
RunPCA(npcs = 20, verbose = FALSE)
# Plot befor applying harmony
options(repr.plot.height = 5, repr.plot.width = 12)
p1 <- DimPlot(object = merged_filt, reduction = "pca", pt.size = .1, group.by = "orig.ident")
p2 <- VlnPlot(object = merged_filt, features = "PC_1", group.by = "orig.ident", pt.size = .1)
plot(p1)
plot(p1)
# Apply Harmony Integration:
seurat.HM <- merged_filt %>%
RunHarmony("orig.ident", plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony converged after 5 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details
## on syntax validity
DimPlot(object = seurat.HM, reduction = "harmony", pt.size = .1, group.by = "orig.ident")
# Apply standard workflow for UMAP and clustering
seurat.HM <- seurat.HM %>%
RunUMAP(reduction = "harmony", dims = 1:20) %>%
FindNeighbors(reduction = "harmony", dims = 1:20) %>%
FindClusters(resolution = 0.5) %>%
identity()
## 10:59:55 UMAP embedding parameters a = 0.9922 b = 1.112
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 10:59:55 Read 17437 rows and found 20 numeric columns
## 10:59:55 Using Annoy for neighbor search, n_neighbors = 30
## Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
## Also defined by 'spam'
## 10:59:55 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:59:56 Writing NN index file to temp file /var/folders/ss/g2916l0d47qf7gp8xrgslkmw0000gr/T//RtmpF2DBAC/filebbbf7d259ca6
## 10:59:56 Searching Annoy index using 1 thread, search_k = 3000
## 11:00:00 Annoy recall = 100%
## 11:00:01 Commencing smooth kNN distance calibration using 1 thread
## 11:00:02 Initializing from normalized Laplacian + noise
## 11:00:02 Commencing optimization for 200 epochs, with 778736 positive edges
## 11:00:14 Optimization finished
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 17437
## Number of edges: 584179
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8724
## Number of communities: 8
## Elapsed time: 2 seconds
DimPlot(seurat.HM, reduction = "umap",
split.by = "orig.ident", # this facets the plot
group.by = "seurat_clusters", # labels the cells with values from your group.by variable
label = TRUE)
##---------------------------------------------------------------------------
# Lets do same analysis as we did above for CCA integrated object
##---------------------------------------------------------------------------------
# Finding the marker genes for a cluster vs rest
cluster.markersHM <- FindAllMarkers(seurat.HM, only.pos = TRUE,
min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
# let's take the top 5 marker genes for each cluster and plot as a heatmap
top5 <- cluster.markersHM %>%
group_by(cluster) %>%
top_n(n = 5, wt = avg_log2FC)
DoHeatmap(seurat.HM, features = top5$gene)
# You can extract markers for particular cluster
cluster0HM <- cluster.markersHM %>% filter(cluster == "0")
cluster0HM$pct.diff <- cluster0HM$pct.1 - cluster0HM$pct.2
cluster0HM <- cluster0HM[order(-cluster0HM$pct.diff),]
head(cluster0)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene pct.diff
## MTDH 0.000000e+00 0.5856496 0.926 0.551 0.000000e+00 0 MTDH 0.375
## SRM 5.931359e-282 0.3237898 0.811 0.443 1.186272e-278 0 SRM 0.368
## TPI1 1.395593e-270 0.3930449 0.923 0.572 2.791186e-267 0 TPI1 0.351
## GUK1 1.509491e-306 0.2657783 0.801 0.451 3.018983e-303 0 GUK1 0.350
## AP2M1 2.439654e-270 0.3274912 0.763 0.413 4.879308e-267 0 AP2M1 0.350
## PFN1 0.000000e+00 0.4384106 0.941 0.597 0.000000e+00 0 PFN1 0.344
head(cluster0HM)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene pct.diff
## KCNQ5 0 1.190401 0.747 0.420 0 0 KCNQ5 0.327
## SOX5 0 1.172482 0.852 0.527 0 0 SOX5 0.325
## PDE4D 0 1.139245 0.808 0.490 0 0 PDE4D 0.318
## LRMDA 0 1.081837 0.875 0.575 0 0 LRMDA 0.300
## IMMP2L 0 1.261109 0.952 0.652 0 0 IMMP2L 0.300
## LINC01320 0 1.306881 0.960 0.664 0 0 LINC01320 0.296
# we can plot gene of interest
genes <- c("KCNQ5", "SOX5")
FeaturePlot(seurat.HM,
reduction = "umap",
features = genes,
pt.size = 0.4,
order = TRUE,
split.by = "orig.ident",
min.cutoff = 'q10',
label = FALSE)
VlnPlot(seurat_integrated, features = genes)
# Dot plot of top 2 genes in a cluster
top2 <- cluster.markersHM %>%
group_by(cluster) %>%
top_n(n = 2, wt = avg_log2FC)
DotPlot(seurat.HM, features=unique(top2$gene), dot.scale = 6) +
RotatedAxis() +theme(axis.text.x = element_text(color = "black", size = 10))
## DE markers between two samples
seuratDE <- seurat.HM
Idents(seuratDE) <- "orig.ident"
markersDE <- FindMarkers(object = seuratDE,ident.1 = "A375_treated",
ident.2 = "A375",
min.pct = 0.25)
topDE <- markersDE[(markersDE$avg_log2FC >0.25) | (markersDE$avg_log2FC < -0.25),]
topDE <- topDE[order(-(topDE$avg_log2FC)),]
head(topDE)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## NLGN1 0.000000e+00 1.2194070 0.692 0.380 0.000000e+00
## FXYD3 0.000000e+00 1.1935889 0.392 0.056 0.000000e+00
## IMMP2L 5.713169e-287 0.7962357 0.811 0.645 1.309001e-282
## HMGA2 0.000000e+00 0.7888583 0.649 0.386 0.000000e+00
## FN1 5.064772e-283 0.7857883 0.710 0.486 1.160440e-278
## SOX5 4.681670e-267 0.7775778 0.714 0.507 1.072664e-262
tail(topDE)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## ACTG1 0.000000e+00 -0.7105564 0.942 0.943 0.000000e+00
## LINC01918 1.511004e-106 -0.7269357 0.344 0.463 3.462012e-102
## APOE 3.880942e-162 -0.7566181 0.323 0.491 8.892014e-158
## COX5B 0.000000e+00 -0.8267984 0.802 0.862 0.000000e+00
## CAPG 0.000000e+00 -0.8386470 0.748 0.810 0.000000e+00
## ERCC6L2 0.000000e+00 -1.5870124 0.201 0.577 0.000000e+00
# Upregulated in treated samples
genes <- c("NLGN1", "FXYD3")
FeaturePlot(seuratDE,
reduction = "umap",
features = genes,
pt.size = 0.4,
order = TRUE,
split.by = "orig.ident",
min.cutoff = 'q10',
label = FALSE)
# Downregulated in treated samples
genes <- c("ACTG1", "LINC01918")
FeaturePlot(seuratDE,
reduction = "umap",
features = genes,
pt.size = 0.4,
order = TRUE,
split.by = "orig.ident",
min.cutoff = 'q10',
label = FALSE)
genes <- c("NLGN1", "FXYD3", "ACTG1", "LINC01918")
DotPlot(seuratDE, features=genes, dot.scale = 6) +
RotatedAxis() +theme(axis.text.x = element_text(color = "black", size = 10))
I find Harmony Integration gives better overall result for finding
clusters and markers.
Tutorial:
https://bioconductor.org/packages/release/bioc/vignettes/SingleR/inst/doc/SingleR.html
# First install the required packages
#BiocManager::install(c("scater", "scran","DropletUtils","SingleR","celldex" ))
#devtools::install_github("Irrationone/cellassign")
library(scater) #quality control and visualization for scRNA-seq data
## Loading required package: scuttle
library(scran) #for low level processing of scRNA-seq data
library(DropletUtils)
library(tensorflow)
library(cellassign)
## Loaded Tensorflow version 2.7.0
library(SingleR) #automated cell type annotation ('label transfer') using reference data
library(celldex) #a large collection of reference expression datasets with curated cell type labels for use with SingleR package
##
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
##
## BlueprintEncodeData, DatabaseImmuneCellExpressionData,
## HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
## MouseRNAseqData, NovershternHematopoieticData
library(pheatmap)
# convert to single cell experiment
seurat.HM.sce <- as.SingleCellExperiment(seurat.HM)
# I am using Human Cell Atlas data as reference. You can play around with many data source given in the package.
ref <- HumanPrimaryCellAtlasData()
## snapshotDate(): 2021-10-19
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
predictions <- SingleR(test=seurat.HM.sce, assay.type.test=1,
ref=ref, labels=ref$label.main)
plotScoreHeatmap(predictions)
# Add back to singleCellExperiment object (or Seurat objects)
seurat.HM.sce[["SingleR.labels"]] <- predictions$labels
plotUMAP(seurat.HM.sce, colour_by = "SingleR.labels")
seurat.HM2 <- as.Seurat(seurat.HM.sce, counts = NULL)
DimPlot(seurat.HM2, reduction = "UMAP",
split.by = "orig.ident", # this facets the plot
group.by = "SingleR.labels", # labels the cells with values from your group.by variable
label = TRUE)
#### 2. Using sc-type Tutorial:
https://github.com/IanevskiAleksandr/sc-type
# load libraries and functions
#install.packages("HGNChelper")
lapply(c("dplyr","Seurat","HGNChelper"), library, character.only = T)
## [[1]]
## [1] "pheatmap" "celldex" "SingleR"
## [4] "cellassign" "tensorflow" "DropletUtils"
## [7] "scran" "scater" "scuttle"
## [10] "KernSmooth" "fields" "viridis"
## [13] "viridisLite" "spam" "harmony"
## [16] "Rcpp" "SingleCellExperiment" "SummarizedExperiment"
## [19] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [22] "IRanges" "S4Vectors" "BiocGenerics"
## [25] "stats4" "MatrixGenerics" "matrixStats"
## [28] "cowplot" "DoubletFinder" "dplyr"
## [31] "ggplot2" "SeuratObject" "Seurat"
## [34] "stats" "graphics" "grDevices"
## [37] "utils" "datasets" "methods"
## [40] "base"
##
## [[2]]
## [1] "pheatmap" "celldex" "SingleR"
## [4] "cellassign" "tensorflow" "DropletUtils"
## [7] "scran" "scater" "scuttle"
## [10] "KernSmooth" "fields" "viridis"
## [13] "viridisLite" "spam" "harmony"
## [16] "Rcpp" "SingleCellExperiment" "SummarizedExperiment"
## [19] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [22] "IRanges" "S4Vectors" "BiocGenerics"
## [25] "stats4" "MatrixGenerics" "matrixStats"
## [28] "cowplot" "DoubletFinder" "dplyr"
## [31] "ggplot2" "SeuratObject" "Seurat"
## [34] "stats" "graphics" "grDevices"
## [37] "utils" "datasets" "methods"
## [40] "base"
##
## [[3]]
## [1] "HGNChelper" "pheatmap" "celldex"
## [4] "SingleR" "cellassign" "tensorflow"
## [7] "DropletUtils" "scran" "scater"
## [10] "scuttle" "KernSmooth" "fields"
## [13] "viridis" "viridisLite" "spam"
## [16] "harmony" "Rcpp" "SingleCellExperiment"
## [19] "SummarizedExperiment" "Biobase" "GenomicRanges"
## [22] "GenomeInfoDb" "IRanges" "S4Vectors"
## [25] "BiocGenerics" "stats4" "MatrixGenerics"
## [28] "matrixStats" "cowplot" "DoubletFinder"
## [31] "dplyr" "ggplot2" "SeuratObject"
## [34] "Seurat" "stats" "graphics"
## [37] "grDevices" "utils" "datasets"
## [40] "methods" "base"
# load gene set preparation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
# load cell type annotation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
# DB file
db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx";
tissue = "Brain" # e.g. Immune system, Liver, Pancreas, Kidney, Eye, Brain
# prepare gene sets
gs_list = gene_sets_prepare(db_, tissue)
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
## Maps last updated on: Thu Oct 24 12:31:05 2019
DefaultAssay(seurat.HM)
## [1] "RNA"
# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = seurat.HM[["RNA"]]@scale.data, scaled = TRUE,
gs = gs_list$gs_positive, gs2 = gs_list$gs_negative)
# merge by cluster
cL_resutls = do.call("rbind", lapply(unique(seurat.HM@meta.data$seurat_clusters), function(cl){
es.max.cl = sort(rowSums(es.max[ ,rownames(seurat.HM@meta.data[seurat.HM@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(seurat.HM@meta.data$seurat_clusters==cl)), 10)
}))
sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"
print(sctype_scores[,1:3])
## # A tibble: 8 × 3
## # Groups: cluster [8]
## cluster type scores
## <fct> <chr> <dbl>
## 1 3 Immature neurons 525.
## 2 7 Immature neurons 440.
## 3 2 Unknown -60.9
## 4 6 Unknown -9.16
## 5 1 Cancer cells 1274.
## 6 0 Neural Progenitor cells 1025.
## 7 5 Immature neurons 647.
## 8 4 Endothelial cells 812.
# Overlay the identified cell types on UMAP plot:
seurat.HM@meta.data$customclassif = ""
for(j in unique(sctype_scores$cluster)){
cl_type = sctype_scores[sctype_scores$cluster==j,];
seurat.HM@meta.data$customclassif[seurat.HM@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}
DimPlot(seurat.HM, reduction = "umap", label = TRUE, repel = TRUE,
group.by = 'customclassif')
#### Plot module scores for genes of interest I am going to show some
examples how you can plot the module scores for your gene of interest. I
am interested in cancer related EMT and Human Angiogenesis genes
(ANG)
EMT <- list(c('ABLIM1', 'CDH1', 'COL1A1', 'COL1A2', 'COL3A1', 'COL5A1', 'COL5A2',
'COL6A1', 'COL6A2', 'COL6A3', 'DSP', 'FN1', 'FYN', 'GNAI2', 'JUP',
'LAMA4', 'PDGFRB', 'SERPINB5', 'ZEB2', 'CDH2', 'COL1A1', 'COL3A1',
'COL5A1', 'COL6A1', 'COL6A2', 'EPHA1', 'ERBB3', 'FN1',
'FYN', 'GNAI2', 'LAMA4', 'VIM'))
ANG <- list(c('AKT1', 'ANGPT1', 'ANGPT2', 'ANGPTL3', 'ANGPTL4', 'ANPEP',
'ADGRB1', 'CCL11', 'CCL2', 'CDH5', 'COL18A1', 'COL4A3', 'CXCL1',
'CXCL10', 'CXCL3', 'CXCL5', 'CXCL6', 'CXCL9', 'TYMP', 'S1PR1',
'EFNA1', 'EFNA3', 'EFNB2', 'EGF', 'ENG', 'EPHB4', 'EREG', 'FGF1',
'FGF2', 'FGFR3', 'FIGF', 'FLT1', 'HAND2', 'HGF', 'HIF1A', 'HPSE',
'ID1', 'ID3', 'IFNA1', 'IFNB1', 'IFNG', 'IGF1', 'IL1B', 'IL6',
'CXCL8', 'ITGAV', 'ITGB3', 'JAG1', 'KDR', 'LAMA5', 'LECT1', 'LEP',
'MDK', 'MMP2', 'MMP9', 'NOTCH4', 'NRP1', 'NRP2', 'PDGFA', 'PECAM1',
'PF4', 'PGF', 'PLAU', 'PLG', 'PLXDC1', 'PROK2', 'PTGS1',
'SERPINF1', 'SPHK1', 'STAB1', 'TEK', 'TGFA', 'TGFB1', 'TGFB2',
'TGFBR1', 'THBS1', 'THBS2', 'TIMP1', 'TIMP2', 'TIMP3', 'TNF',
'TNFAIP2', 'VEGFA', 'VEGFC', 'B2M', 'HPRT1', 'RPL13A', 'GAPDH',
'ACTB', 'HGDC', 'RTC', 'RTC', 'RTC', 'PPC', 'PPC', 'PPC'))
gene_list <- list(EMT, ANG)
gene_class <- c("EMT", "ANG")
names(gene_list) <- gene_class
# Add module score to seurat meta data
for (i in gene_class) {
seurat.HM <- AddModuleScore(
object = seurat.HM,
features = gene_list[i],
ctrl = 20,
name = i
)}
## Warning: The following features are not present in the object: c("ABLIM1",
## "CDH1", "COL1A1", "COL1A2", "COL3A1", "COL5A1", "COL5A2", "COL6A1", "COL6A2",
## "COL6A3", "DSP", "FN1", "FYN", "GNAI2", "JUP", "LAMA4", "PDGFRB", "SERPINB5",
## "ZEB2", "CDH2", "COL1A1", "COL3A1", "COL5A1", "COL6A1", "COL6A2", "EPHA1",
## "ERBB3", "FN1", "FYN", "GNAI2", "LAMA4", "VIM"), not searching for symbol
## synonyms
## Warning in AddModuleScore(object = seurat.HM, features = gene_list[i], ctrl =
## 20, : Could not find enough features in the object from the following feature
## lists: EMT Attempting to match case...
## Warning in grep(pattern = paste0("^", s, "$"), x = match, ignore.case = TRUE, :
## argument 'pattern' has length > 1 and only the first element will be used
## Warning: The following features are not present in the object: c("AKT1", "ANGPT1", "ANGPT2", "ANGPTL3", "ANGPTL4", "ANPEP", "ADGRB1", "CCL11", "CCL2", "CDH5", "COL18A1", "COL4A3", "CXCL1", "CXCL10", "CXCL3", "CXCL5", "CXCL6", "CXCL9", "TYMP", "S1PR1", "EFNA1", "EFNA3", "EFNB2", "EGF", "ENG", "EPHB4", "EREG", "FGF1", "FGF2", "FGFR3", "FIGF", "FLT1", "HAND2", "HGF", "HIF1A", "HPSE", "ID1", "ID3", "IFNA1", "IFNB1", "IFNG", "IGF1", "IL1B", "IL6", "CXCL8", "ITGAV", "ITGB3", "JAG1", "KDR", "LAMA5", "LECT1", "LEP", "MDK", "MMP2", "MMP9", "NOTCH4", "NRP1", "NRP2", "PDGFA",
## "PECAM1", "PF4", "PGF", "PLAU", "PLG", "PLXDC1", "PROK2", "PTGS1", "SERPINF1", "SPHK1", "STAB1", "TEK", "TGFA", "TGFB1", "TGFB2", "TGFBR1", "THBS1", "THBS2", "TIMP1", "TIMP2", "TIMP3", "TNF", "TNFAIP2", "VEGFA", "VEGFC", "B2M", "HPRT1", "RPL13A", "GAPDH", "ACTB", "HGDC", "RTC", "RTC", "RTC", "PPC", "PPC", "PPC"), not searching for symbol synonyms
## Warning in AddModuleScore(object = seurat.HM, features = gene_list[i], ctrl =
## 20, : Could not find enough features in the object from the following feature
## lists: ANG Attempting to match case...
## Warning in grep(pattern = paste0("^", s, "$"), x = match, ignore.case = TRUE, :
## argument 'pattern' has length > 1 and only the first element will be used
# Plot Module score
data <- seurat.HM@meta.data[,c("orig.ident","seurat_clusters",
"EMT1","ANG1")]
head(data)
## orig.ident seurat_clusters EMT1 ANG1
## AAACCCAAGGTTCTTG-1_1 A375 3 0.00000000 -0.13509859
## AAACCCAGTGACAGCA-1_1 A375 7 -0.08659321 -0.08659321
## AAACCCAGTGTTTACG-1_1 A375 7 -0.06600254 -0.13200509
## AAACCCAGTTTGTTGG-1_1 A375 3 0.00000000 0.00000000
## AAACCCATCATTGTGG-1_1 A375 3 -0.09301711 0.00000000
## AAACCCATCCATTCAT-1_1 A375 2 -0.14821815 0.00000000
gene_name<- c( "EMT Genes","Angiogenesis Genes")
# Change a (position of gene in gene_name) or you can use a for loop and save corresponding figure using ggsave
a = 1
i = colnames(data)[a+2]
#Cluster plot
df <- data[, c("orig.ident","seurat_clusters", i)]
colnames(df) <- c("orig.ident","seurat_clusters", "y")
ggplot(df, aes(x=seurat_clusters, y=y, fill =orig.ident )) + theme_classic() +
geom_boxplot() +
ggtitle(gene_name[a])+
theme(axis.text.x = element_text(size = 15, face = "bold", angle = 0, vjust = 0.6, hjust=0.5),
axis.text.y = element_text(size = 15, face = "bold", angle = 0, vjust = 0.5, hjust=0)
,axis.title.x=element_text(size=18,face="bold", vjust = 0.5 )
,axis.title.y=element_text(size=18,face="bold", hjust = 0.5, vjust = 1.5 ),
plot.title = element_text(size = 20, face = "bold"),
legend.title=element_text(size=14, face = "bold"))+
stat_summary(fun=mean, geom="point", shape=11, size=1) +
xlab("Clusters") +
ylab("Module Score") +
labs(fill = "Sample") +
theme(plot.title = element_text(hjust = 0.5))
#ggsave(paste0("Gene_annotations/",gene_name[a],"_cluster.pdf"), scale = 0.9)
# Average plot across samples
df <- data[, c("orig.ident","seurat_clusters", i)]
colnames(df) <- c("orig.ident","seurat_clusters", "y")
ggplot(df, aes(x=orig.ident, y=y, fill =orig.ident )) + theme_classic() +
geom_boxplot() +
ggtitle(gene_name[a])+
theme(axis.text.x = element_text(size = 15, face = "bold", angle = 75, vjust = 0.55, hjust=0.55),
axis.text.y = element_text(size = 15, face = "bold", angle = 0, vjust = 0.5, hjust=0)
,axis.title.x=element_text(size=18,face="bold", vjust = 0.5 )
,axis.title.y=element_text(size=18,face="bold", hjust = 0.5, vjust = 1.5 ),
plot.title = element_text(size = 20, face = "bold"),
legend.title=element_text(size=14, face = "bold"))+
stat_summary(fun=mean, geom="point", shape=11, size=1) +
xlab("Samples") +
ylab("Module Score") +
labs(fill = "Sample") +
theme(plot.title = element_text(hjust = 0.5))
https://cole-trapnell-lab.github.io/monocle3/
# Install depencies
# BiocManager::install(c('BiocGenerics', 'DelayedArray', 'DelayedMatrixStats',
# 'limma', 'S4Vectors', 'SingleCellExperiment',
# 'SummarizedExperiment', 'batchelor', 'Matrix.utils'))
# install.packages("devtools")
# devtools::install_github('cole-trapnell-lab/leidenbase')
# It doest not work in my M1 mac, hence I first downloaded gfortran from
# https://github.com/fxcoudert/gfortran-for-macOS/releases, I used the intel version gfortran-Intel-11.3-Monterey.dmg
# You need to install ROSETTA to intstall intel version on M1 apple (https://machow2.com/rosetta-mac/)
# devtools::install_github('cole-trapnell-lab/monocle3')
# devtools::install_github('satijalab/seurat-data')
# remotes::install_github('satijalab/seurat-wrappers')
library(monocle3)
##
## Attaching package: 'monocle3'
## The following objects are masked from 'package:Biobase':
##
## exprs, fData, fData<-, pData, pData<-
library(SeuratData)
library(SeuratWrappers)
library(ggplot2)
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
library(magrittr)
library(scater)
library(SingleCellExperiment)
# Convert Seurat Object to sce object
seurat.HM.sce <- as.SingleCellExperiment(seurat.HM)
cds <- as.cell_data_set(DietSeurat(seurat.HM, graphs = "umap"))
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
##
|
| | 0%
|
|======================================================================| 100%
## Warning in igraph::graph.dfs(stree_ori, root = root_cell, neimode = "all", :
## Argument `neimode' is deprecated; use `mode' instead
##
|
| | 0%
|
|======================================================================| 100%
## Warning in igraph::graph.dfs(stree_ori, root = root_cell, neimode = "all", :
## Argument `neimode' is deprecated; use `mode' instead
plot_cells(cds, color_cells_by = "partition")
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# For interactive Rshiny please uncomment this two line
#bcds <- order_cells(cds, reduction_method = "UMAP")
# plot_cells(cds, color_cells_by = "pseudotime", label_cell_groups=FALSE, label_leaves=FALSE,
# label_branch_points=TRUE,group_label_size = 3, labels_per_group = 3, graph_label_size = 3)
#-------------------------------------------------------------------------------------
# Using endothelial cells as root cell.
#-------------------------------------------------------------------------------------
DimPlot(seurat.HM, reduction = "umap", label = TRUE, repel = TRUE,
group.by = 'customclassif')
end_cells <- rownames(seurat.HM@meta.data[seurat.HM@meta.data$customclassif=="Endothelial cells",])
cds <- order_cells(cds, reduction_method = "UMAP",root_cells = end_cells )
plot_cells(cds,
color_cells_by = "pseudotime",
label_cell_groups=FALSE,
label_leaves=FALSE,
label_branch_points=TRUE,
group_label_size = 1,
labels_per_group = 1,
graph_label_size = 1,
)
That’s the end of this tutorial.